Library

library(tidyverse)
library(here)      
library(readxl)    
library(janitor)   
library(stringr)   
library(tidyr)     
library(dplyr)
library(knitr)
library(ggplot2)
library(factoextra)
library(gt)
library(naniar)
library(ggcorrplot)
library(corrplot)
library(GGally)
library(randomForest)
library(e1071)  
library(caret)
library(plotly)
library(stargazer)
library(broom)
library(kableExtra)
library(gridExtra)
library(patchwork)
library(minerva)
library(mgcv)
library(MASS)
library(glmnet)
library(purrr)
library(insight)
library(grid)

Data Import

raw_bp <- read.csv(here("ZIPallFiles","AHSnpa11bp.csv"), header=TRUE)
raw_bb <- read.csv(here("ZIPallFiles","AHSnpa11bb.csv"), header=TRUE)
raw_bsp <- read.csv(here("ZIPallFiles","inp13bsp.csv"), header=TRUE)
raw_bcn <- read.csv(here("ZIPallFiles","inp13bcn.csv"), header=TRUE)
raw_bhh <- read.csv(here("ZIPallFiles","inp13bhh.csv"), header=TRUE)

Rename and Created

bp_bb <- inner_join(raw_bp, raw_bb, by = "ABSPID") %>%
  rename(HypertensionStatus = HYPBC,
         BodyMass = SABDYMS,
         SocialStatus = SF2SA1QN,
         PersonID = ABSPID,
         Age = AGEC,
         Gender = SEX,
         BMI = BMISC,
         Systolic = SYSTOL,
         Diastolic = DIASTOL,
         SmokeStatus = SMKSTAT,
         AlcoholPercentage_Day1 = ALCPER1,
         AlcoholPercentage_Day2 = ALCPER2,
         Potassium_Day1 = POTAST1,
         Potassium_Day2 = POTAST2,
         Sodium_Day1 = SODIUMT1,
         Sodium_Day2 = SODIUMT2,
         FibreEnergy_Day1 = FIBRPER1,
         FibreEnergy_Day2 = FIBRPER2,
         CarbohydrateEnergy_Day1 = CHOPER1,
         CarbohydrateEnergy_Day2 = CHOPER2,
         EnergyBMR_Day1 = EIBMR1,
         EnergyBMR_Day2 = EIBMR2) %>%
   mutate(Identity = factor(0))

bsp_bhh<- left_join(raw_bsp, raw_bhh,by = c("ABSHID")) %>%
  rename(BodyMass = SABDYMS,
         SocialStatus = SF2SA1DB,
         HouseID = ABSHID,
         Age = AGEEC,
         Gender = SEX,
         BMI = BMISC,
         Systolic = SYSTOL,
         Diastolic = DIASTOL,
         SmokeStatus = SMKSTAT,
         AlcoholPercentage_Day1 = ALCPER1,
         AlcoholPercentage_Day2 = ALCPER2,
         Potassium_Day1 = POTAST1,
         Potassium_Day2 = POTAST2,
         Sodium_Day1 = SODIUMT1,
         Sodium_Day2 = SODIUMT2,
         FibreEnergy_Day1 = FIBRPER1,
         FibreEnergy_Day2 = FIBRPER2,
         CarbohydrateEnergy_Day1 = CHOPER1,
         CarbohydrateEnergy_Day2 = CHOPER2,
         EnergyBMR_Day1 = EIBMR1,
         EnergyBMR_Day2 = EIBMR2)%>%
   mutate(Identity = factor(1))

bcn <- raw_bcn %>%
  rename(HouseID = ABSHID,
         MedicalCondition = ICD10ME)

First Filter

bp_bb <- bp_bb %>%
  mutate(EnergyBMR_Day1 = na_if(EnergyBMR_Day1, 997) %>% na_if(998),
         EnergyBMR_Day2 = na_if(EnergyBMR_Day2, 997) %>% na_if(998),
         EnergyBMR = (EnergyBMR_Day1 + EnergyBMR_Day2)/2) %>%
  filter(
    Age >= 18, 
    HypertensionStatus == 5,
    BodyMass != 4,
    (is.na(EnergyBMR) | EnergyBMR >= 0.9))

bsp_bhh <- bsp_bhh %>% 
  mutate(EnergyBMR_Day1 = na_if(EnergyBMR_Day1, 997) %>% na_if(998),
         EnergyBMR_Day2 = na_if(EnergyBMR_Day2, 997) %>% na_if(998),
         EnergyBMR = (EnergyBMR_Day1 + EnergyBMR_Day2)/2) %>%
  filter(
    Age >= 18, 
    BodyMass != 4,
    !HouseID %in% bcn$HouseID[bcn$MedicalCondition %in% c(7, 20)],
    (is.na(EnergyBMR) | EnergyBMR >= 0.9))

Merge

raw_data <- bind_rows(bp_bb,bsp_bhh)

Variable Selection

selected_data <- raw_data %>%
  dplyr::select(PersonID, HouseID, SocialStatus, Age, Gender, BMI, Systolic, Diastolic, SmokeStatus, Identity,
         AlcoholPercentage_Day1, AlcoholPercentage_Day2,
         Potassium_Day1, Potassium_Day2, Sodium_Day1, Sodium_Day2,
         FibreEnergy_Day1, FibreEnergy_Day2, 
         CarbohydrateEnergy_Day1, CarbohydrateEnergy_Day2)

Type Check 1

data.frame(Variable = names(selected_data), Type = sapply(selected_data, class)) %>%
  gt() %>%
  tab_header(
    title = "Variable Names and Their Types"
  ) %>%
 tab_caption(caption = md("Variable types table"))
Variable types table
Variable Names and Their Types
Variable Type
PersonID character
HouseID character
SocialStatus integer
Age integer
Gender integer
BMI numeric
Systolic integer
Diastolic integer
SmokeStatus integer
Identity factor
AlcoholPercentage_Day1 numeric
AlcoholPercentage_Day2 numeric
Potassium_Day1 numeric
Potassium_Day2 numeric
Sodium_Day1 numeric
Sodium_Day2 numeric
FibreEnergy_Day1 numeric
FibreEnergy_Day2 numeric
CarbohydrateEnergy_Day1 numeric
CarbohydrateEnergy_Day2 numeric

Na values + New Variables + Type Conversion

selected_data <- selected_data %>%
  mutate(
    across(c(Gender, SocialStatus, SmokeStatus), as.factor),
    BMI = na_if(BMI, 98) %>% na_if(99),
    Systolic = na_if(Systolic, 0) %>% na_if(998) %>% na_if(999),
    Diastolic = na_if(Diastolic, 0) %>% na_if(998) %>% na_if(999),
    Potassium = (Potassium_Day1 + Potassium_Day2) / 2, 
    Sodium = (Sodium_Day1 + Sodium_Day2) / 2,
    FibreEnergy = (FibreEnergy_Day1 + FibreEnergy_Day2)/2,
    CarbohydrateEnergy = (CarbohydrateEnergy_Day1 + CarbohydrateEnergy_Day2) / 2,
    AlcoholPercentage = (AlcoholPercentage_Day1 + AlcoholPercentage_Day2) /2,
    SodiumPotassiumRatio = Sodium / Potassium
    ) %>% 
  dplyr::select(PersonID,HouseID, SocialStatus, Age, Gender, BMI, Systolic, Diastolic, SmokeStatus, AlcoholPercentage, SodiumPotassiumRatio, FibreEnergy, CarbohydrateEnergy, Identity)

Type Check 2

data.frame(Variable = names(selected_data), Type = sapply(selected_data, class)) %>%
  gt() %>%
  tab_header(
    title = "Variable Names and Their Types"
  ) %>%
 tab_caption(caption = md("Variable types table after correction"))
Variable types table after correction
Variable Names and Their Types
Variable Type
PersonID character
HouseID character
SocialStatus factor
Age integer
Gender factor
BMI numeric
Systolic integer
Diastolic integer
SmokeStatus factor
AlcoholPercentage numeric
SodiumPotassiumRatio numeric
FibreEnergy numeric
CarbohydrateEnergy numeric
Identity factor

Duplicate Values

selected_data %>% 
  dplyr::select(-PersonID) %>% 
  distinct() %>% 
  dim() %>% 
  tibble::tibble(Dimension = c("Rows", "Columns"), Count = .) %>% 
  gt() %>%
  tab_header(title = "Dimensions of Unique Tibble") %>%
  tab_caption(caption = md("Dimensions of Unique Data Table"))
Dimensions of Unique Data Table
Dimensions of Unique Tibble
Dimension Count
Rows 8449
Columns 13

Categrocial Distribution

plot_categorical_distribution <- function(data) {
  categorical_vars <- data %>%
    dplyr::select(where(is.factor) | where(is.character)) %>%
    dplyr::select(-PersonID, -HouseID)
  figure_counter <- 1
  for (var in names(categorical_vars)) {
    non_na_data <- data %>%
      filter(!is.na(.data[[var]]))
    if (nrow(non_na_data) == 0) next
    
    figure_title <- paste("Proportion of Categories in ", var, sep = "")
    
    p <- ggplot(non_na_data, aes(x = .data[[var]])) +       
      geom_bar(aes(y = (..count..) / sum(..count..), fill = .data[[var]]), color = "black") +       
      scale_fill_brewer(palette = "Set3") +        
      labs(title = figure_title, x = "Category", y = "Proportion") +       
      theme_minimal() +       
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
    print(p)
    figure_counter <- figure_counter + 1
    }
  }

plot_categorical_distribution(selected_data) 

Merge Variable

selected_data <- selected_data %>% 
  mutate(SmokeStatus = recode_factor(SmokeStatus, "2" = "1", "3" = "1","4" = "2","5" = "3"))

Outliers and New Variables

normal_ranges <- list(Systolic = c(90, 180), Diastolic = c(60, 120), BMI = c(16, 47.5), 
                      SodiumPotassiumRatio = c(0, 4), AlcoholPercentage = c(0, 85), 
                      CarbohydrateEnergy = c(0, 100), FibreEnergy = c(0, 100))

selected_data <- selected_data %>%
  filter(
    (is.na(Systolic) | (Systolic >= normal_ranges$Systolic[1] & Systolic <= normal_ranges$Systolic[2])),
    (is.na(Diastolic) | (Diastolic >= normal_ranges$Diastolic[1] & Diastolic <= normal_ranges$Diastolic[2])),
    (is.na(BMI) | (BMI >= normal_ranges$BMI[1] & BMI <= normal_ranges$BMI[2])),
    (is.na(AlcoholPercentage) | (AlcoholPercentage >= normal_ranges$AlcoholPercentage[1] & AlcoholPercentage <= normal_ranges$AlcoholPercentage[2])),
    (is.na(CarbohydrateEnergy) | (CarbohydrateEnergy >= normal_ranges$CarbohydrateEnergy[1] & CarbohydrateEnergy <= normal_ranges$CarbohydrateEnergy[2])),
    (is.na(FibreEnergy) | (FibreEnergy >= normal_ranges$FibreEnergy[1] & FibreEnergy <= normal_ranges$FibreEnergy[2])),
    (is.na(SodiumPotassiumRatio) | (SodiumPotassiumRatio >= normal_ranges$SodiumPotassiumRatio[1] & SodiumPotassiumRatio <= normal_ranges$SodiumPotassiumRatio[2]))
  ) %>%
  mutate(Hypertension = as.factor(case_when(Systolic >= 120 | Diastolic >= 80 ~ 1,TRUE ~ 0)))

Variable check

variable_data <- data.frame(
  Variable = c("PersonID", "HouseID", "SocialStatus", "Age", "Gender", "BMI", 
               "Systolic", "Diastolic", "SmokeStatus", "AlcoholPercentage", 
               "SodiumPotassiumRatio", "FibreEnergy", "CarbohydrateEnergy", 
               "Identity"),
  Description = c("Selected person identifier", "Household identifier",
                  "ICD10 Conditions - Mini classification",
                  "Age of person", "Sex of person",
                  "Body Mass Index (BMI) - score measured",
                  "Systolic Blood Pressure (mmHg)",
                  "Diastolic Blood Pressure (mmHg)", "Smoking Status",
                  "Average of Percentage of energy from alcohol (Day1) and Percentage of energy from alcohol (Day2)",
                  "The ratio of the average of Sodium (supplement) Day 1 mg and Sodium (supplement) Day 2 mg over the average of Potassium (total) Day 1 mg and Potassium (total) Day 2 mg",
                  "Average of Percentage of energy from fibre (Day1) and Percentage of energy from fibre (Day2)",
                  "Average of Percentage of energy from carbohydrate (Day1) and Percentage of energy from carbohydrate (Day2)",
                  "Indicates whether the individual is part of the indigenous population"))

variable_table <- variable_data %>%
  gt() %>%
  tab_header(title = "All related variables") %>%
  cols_label(Variable = "Variable",Description = "Description") %>%
  tab_caption(caption = md("Variable Descriptions"))%>%
  fmt_markdown(columns = everything()) %>%
  tab_options(table.width = pct(100))

variable_table
Variable Descriptions
All related variables
Variable Description
PersonID Selected person identifier
HouseID Household identifier
SocialStatus ICD10 Conditions - Mini classification
Age Age of person
Gender Sex of person
BMI Body Mass Index (BMI) - score measured
Systolic Systolic Blood Pressure (mmHg)
Diastolic Diastolic Blood Pressure (mmHg)
SmokeStatus Smoking Status
AlcoholPercentage Average of Percentage of energy from alcohol (Day1) and Percentage of energy from alcohol (Day2)
SodiumPotassiumRatio The ratio of the average of Sodium (supplement) Day 1 mg and Sodium (supplement) Day 2 mg over the average of Potassium (total) Day 1 mg and Potassium (total) Day 2 mg
FibreEnergy Average of Percentage of energy from fibre (Day1) and Percentage of energy from fibre (Day2)
CarbohydrateEnergy Average of Percentage of energy from carbohydrate (Day1) and Percentage of energy from carbohydrate (Day2)
Identity Indicates whether the individual is part of the indigenous population

This table may require table number as it may be placed in Methodology.

selected_predictors <- selected_data %>%
  dplyr::select(FibreEnergy, CarbohydrateEnergy)

predictor_table <- data.frame(Variable = names(selected_predictors), Type = sapply(selected_predictors, class)) %>%
  gt() %>%
  tab_header(title = "Variable Names and Their Types") %>%
  tab_caption(caption = md("Predictor variables and their data types"))

predictor_table
Predictor variables and their data types
Variable Names and Their Types
Variable Type
FibreEnergy numeric
CarbohydrateEnergy numeric
confounding_vars <- selected_data %>%
  dplyr::select(-FibreEnergy, -CarbohydrateEnergy,-PersonID, -HouseID)

confounding_table <- data.frame(Variable = names(confounding_vars), Type = sapply(confounding_vars, class)) %>%
  gt() %>%
  tab_header(title = "Confounding Variables and Their Types") %>%
  tab_caption(caption = md("Confounding variables and their data types"))

confounding_table
Confounding variables and their data types
Confounding Variables and Their Types
Variable Type
SocialStatus factor
Age integer
Gender factor
BMI numeric
Systolic integer
Diastolic integer
SmokeStatus factor
AlcoholPercentage numeric
SodiumPotassiumRatio numeric
Identity factor
Hypertension factor
format_counts <- function(count, total) {
  paste(count, "(", round(count / total * 100, 1), "%)", sep = "")
}

format_counts_list <- function(values, data, column) {
  sapply(values, function(x) format_counts(sum(data[[column]] == x), nrow(data)))
}

format_mean_sd <- function(mean_val, sd_val) {
  paste(round(mean_val, 2), "(", round(sd_val, 2), ")", sep = "")
}

format_mean_sd_list <- function(data, columns) {
  sapply(columns, function(col) format_mean_sd(mean(data[[col]], na.rm = TRUE), sd(data[[col]], na.rm = TRUE)))
}

get_social_status <- function(data) {
  format_counts_list(1:5, data, "SocialStatus")
}

get_gender <- function(data) {
  format_counts_list(1:2, data, "Gender")
}

get_smoke_status <- function(data) {
  format_counts_list(1:3, data, "SmokeStatus")
}

get_identity <- function(data) {
  format_counts_list(0:1, data, "Identity")
}

get_age <- function(data) {
  c(
    format_counts(sum(data$Age >= 18 & data$Age <= 49), nrow(data)),
    format_counts(sum(data$Age > 50), nrow(data))
  )
}

get_health_metrics <- function(data) {
  format_mean_sd_list(data, c("BMI", "Systolic", "Diastolic", "AlcoholPercentage", "SodiumPotassiumRatio", "FibreEnergy", "CarbohydrateEnergy"))
}

riskdata <- selected_data %>% filter(Hypertension == 1)
noriskdata <- selected_data %>% filter(Hypertension == 0)

df <- data.frame(
  Variable = c(
    "Lowest 20%, N(%)", "Second quintile, N(%)", "Third quintile, N(%)", "Fourth quintile, N(%)", "Highest 20%*, N(%)",
    "Male, N(%)", "Female, N(%)",
    "Currently Smoker, N(%)", "Ex-Smoker, N(%)", "Never smoked, N(%)",
    "Non Indigenous, N(%)", "Indigenous, N(%)",
    "18-49, N(%)", ">50, N(%)",
    "BMI, kg/m² , mean(SD)", 
    "Systolic, mmHg, mean(SD)", "Diastolic, mmHg, mean(SD)", 
    "Alcohol Percentage, mean(SD)", 
    "Sodium Potassium Ratio, mean(SD)", 
    "Fibre Energy, mean(SD)", 
    "Carbohydrate Energy, mean(SD)"
  ),
  Group = c(
    "SocialStatus", "SocialStatus", "SocialStatus", "SocialStatus", "SocialStatus",
    "Gender", "Gender",
    "SmokeStatus", "SmokeStatus", "SmokeStatus",
    "Identity", "Identity",
    "Age", "Age",
    NA, NA, NA, NA, NA, NA, NA
  ),
  Total = c(
    get_social_status(selected_data),
    get_gender(selected_data),
    get_smoke_status(selected_data),
    get_identity(selected_data),
    get_age(selected_data),
    get_health_metrics(selected_data)
  ),
  HypertensionRisk = c(
    get_social_status(riskdata),
    get_gender(riskdata),
    get_smoke_status(riskdata),
    get_identity(riskdata),
    get_age(riskdata),
    get_health_metrics(riskdata)
  ),
  NoHypertensionRisk = c(
    get_social_status(noriskdata),
    get_gender(noriskdata),
    get_smoke_status(noriskdata),
    get_identity(noriskdata),
    get_age(noriskdata),
    get_health_metrics(noriskdata)
  )
)

total_count <- nrow(selected_data)
risk_count <- nrow(riskdata)
no_risk_count <- nrow(noriskdata)

df %>%
  format_table(ci_brackets = c("(", ")")) %>%
  export_table(format = "html", title = "Table 1: Demographic and Health Characteristics by Hypertension Status")
Table 1: Demographic and Health Characteristics by Hypertension Status
Variable Total HypertensionRisk NoHypertensionRisk
SocialStatus
Lowest 20%, N(%) 2164(27.3%) 1079(28.7%) 1085(26%)
Second quintile, N(%) 1570(19.8%) 769(20.4%) 801(19.2%)
Third quintile, N(%) 1430(18%) 665(17.7%) 765(18.3%)
Fourth quintile, N(%) 1223(15.4%) 572(15.2%) 651(15.6%)
Highest 20%*, N(%) 1545(19.5%) 678(18%) 867(20.8%)
Gender
Male, N(%) 3639(45.9%) 2001(53.2%) 1638(39.3%)
Female, N(%) 4293(54.1%) 1762(46.8%) 2531(60.7%)
SmokeStatus
Currently Smoker, N(%) 2163(27.3%) 1038(27.6%) 1125(27%)
Ex-Smoker, N(%) 2222(28%) 1112(29.6%) 1110(26.6%)
Never smoked, N(%) 3547(44.7%) 1613(42.9%) 1934(46.4%)
Identity
Non Indigenous, N(%) 6290(79.3%) 2947(78.3%) 3343(80.2%)
Indigenous, N(%) 1642(20.7%) 816(21.7%) 826(19.8%)
Age
18-49, N(%) 5174(65.2%) 2149(57.1%) 3025(72.6%)
>50, N(%) 2597(32.7%) 1537(40.8%) 1060(25.4%)
BMI, kg/m² , mean(SD) 27.04(5.28) 28.18(5.33) 25.71(4.9)
Systolic, mmHg, mean(SD) 120.49(15.92) 130.47(13.58) 107.74(7.21)
Diastolic, mmHg, mean(SD) 77.26(9.73) 82.73(8.91) 70.27(5.2)
Alcohol Percentage, mean(SD) 3.3(6.33) 3.84(6.94) 2.81(5.69)
Sodium Potassium Ratio, mean(SD) 0.93(0.52) 0.91(0.51) 0.94(0.53)
Fibre Energy, mean(SD) 1.62(0.92) 1.61(0.92) 1.64(0.93)
Carbohydrate Energy, mean(SD) 32.57(13) 32.42(13.06) 32.71(12.96)

Missing Values

selected_data %>% 
  dplyr::select(-PersonID, -HouseID) %>% 
  miss_var_summary() %>% 
  gt() %>%
  tab_header(title = "Missing Value") %>%
  tab_caption(caption = md("Missing Values in Selected Data")) %>%
  cols_label(variable = "Variable Name", n_miss = "Missing Count", pct_miss = "Percentage Missing") %>%
  fmt_number(columns = everything(), decimals = 2)
Missing Values in Selected Data
Missing Value
Variable Name Missing Count Percentage Missing
BMI 1,242.00 15.7
Systolic 1,222.00 15.4
Diastolic 1,222.00 15.4
SocialStatus 0.00 0
Age 0.00 0
Gender 0.00 0
SmokeStatus 0.00 0
AlcoholPercentage 0.00 0
SodiumPotassiumRatio 0.00 0
FibreEnergy 0.00 0
CarbohydrateEnergy 0.00 0
Identity 0.00 0
Hypertension 0.00 0
corr_matrix <- selected_data%>%
  dplyr::select(where(is.numeric)) %>%
  mutate(across(everything(), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .))) %>%
  cor(use = "complete.obs")

ggcorrplot(corr_matrix, lab = TRUE, 
           title = "Correlation Heatmap for ALL", 
           colors = c("blue", "white", "red"),
           tl.cex = 9, lab_size = 4) + theme(legend.position = "none")

Regression Imputation

selected_data$BMI[is.na(selected_data$BMI)] <- lm(BMI ~ Gender + SocialStatus + Age + SodiumPotassiumRatio + FibreEnergy + CarbohydrateEnergy + AlcoholPercentage + SmokeStatus + Identity, data = selected_data, na.action = na.exclude) %>% predict(newdata = selected_data[is.na(selected_data$BMI), ])

selected_data$Systolic[is.na(selected_data$Systolic)] <- lm(Systolic ~ Gender + SocialStatus + Age + BMI + SodiumPotassiumRatio + FibreEnergy + CarbohydrateEnergy + AlcoholPercentage + SmokeStatus + Identity, data = selected_data, na.action = na.exclude) %>% predict(newdata = selected_data[is.na(selected_data$Systolic), ])

selected_data$Diastolic[is.na(selected_data$Diastolic)] <- lm(Diastolic ~ Gender + SocialStatus + Age + BMI + Systolic + SodiumPotassiumRatio + FibreEnergy + CarbohydrateEnergy + AlcoholPercentage + SmokeStatus + Identity, data = selected_data, na.action = na.exclude) %>% predict(newdata = selected_data[is.na(selected_data$Diastolic), ])

vis_miss(selected_data %>% dplyr::select(-PersonID, -HouseID)) +
  ggtitle("Missing Data Visualization of Selected Variables")

Normalization

numeric_data <- selected_data %>% dplyr::select_if(is.numeric)
preprocess_params <- preProcess(numeric_data, method = c("center", "scale"))
scaled_numeric_data <- predict(preprocess_params, numeric_data)
normalized_data <- bind_cols(scaled_numeric_data, selected_data %>% dplyr::select_if(Negate(is.numeric)))

Multicollinearity

selected_data %>%
  dplyr::select(where(is.numeric), -Systolic, -Diastolic) %>%
  ggscatmat() +
  ggtitle("Scatterplot Matrix of Selected Numeric Variables")

Correlation

Scatter plot

Fibre_syst_plot <- ggplot(selected_data, aes(x = FibreEnergy, y = Systolic)) +
  geom_point(color = "#FF7F0E", alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "#1F77B4", size = 1.5) +
  labs(x = "Percentage of Energy from Fibre", y = "Systolic (mmHg)") +
  theme_light(base_size = 15) +
  theme(plot.title = element_text(hjust = 0.5)) +
  annotate("text", x = Inf, y = Inf, label = "A", hjust = 1.5, vjust = 1.5, size = 6, fontface = "bold")

Fibre_dias_plot <- ggplot(selected_data, aes(x = FibreEnergy, y = Diastolic)) +
  geom_point(color = "#FF7F0E", alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "#1F77B4", size = 1.5) +
  labs(x = "Percentage of Energy from Fibre", y = "Diastolic (mmHg)") +
  theme_light(base_size = 15) +
  theme(plot.title = element_text(hjust = 0.5)) +
  annotate("text", x = Inf, y = Inf, label = "B", hjust = 1.5, vjust = 1.5, size = 6, fontface = "bold")

carbo_syst_plot <- ggplot(selected_data, aes(x = CarbohydrateEnergy, y = Systolic)) +
  geom_point(color = "#2CA02C", alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "#D62728", size = 1.5) +
  labs(x = "Percentage of Energy from Carbohydrate", y = "Systolic (mmHg)") +
  theme_light(base_size = 15) +
  theme(plot.title = element_text(hjust = 0.5)) +
  annotate("text", x = Inf, y = Inf, label = "C", hjust = 1.5, vjust = 1.5, size = 6, fontface = "bold")

carbo_dias_plot <- ggplot(selected_data, aes(x = CarbohydrateEnergy, y = Diastolic)) +
  geom_point(color = "#2CA02C", alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "#D62728", size = 1.5) +
  labs(x = "Percentage of Energy from Carbohydrate", y = "Diastolic (mmHg)") +
  theme_light(base_size = 15) +
  theme(plot.title = element_text(hjust = 0.5)) +
  annotate("text", x = Inf, y = Inf, label = "D", hjust = 1.5, vjust = 1.5, size = 6, fontface = "bold")

explanation <- textGrob(
  label = "A. Scatter plot of Fibre Energy vs Systolic                 B. Scatter plot of Fibre Energy vs Diastolic\nC. Scatter plot of Carbohydrate Energy vs Systolic   D. Scatter plot of Carbohydrate Energy vs Diastolic",
  x = unit(0, "npc"), y = unit(0.1, "npc"), just = c("left", "bottom"),
  gp = gpar(fontsize = 12, fontface = "italic")
)

grid.arrange(
  arrangeGrob(
    Fibre_syst_plot, Fibre_dias_plot, carbo_syst_plot, carbo_dias_plot,
    nrow = 2,
    top = textGrob("Figure 1: Relationship Between Energy Intake from Fibre and Carbohydrate and Blood Pressure", 
                   gp = gpar(fontsize = 16))
  ),
  explanation,
  heights = unit.c(unit(1, "npc") - unit(3, "lines"), unit(3, "lines"))
)

Linear correlation table

Fibre_syst_pearson <- cor(selected_data$FibreEnergy, selected_data$Systolic, method = "pearson")
Fibre_syst_spearman <- cor(selected_data$FibreEnergy, selected_data$Systolic, method = "spearman")
Fibre_dias_pearson <- cor(selected_data$FibreEnergy, selected_data$Diastolic, method = "pearson")
Fibre_dias_spearman <- cor(selected_data$FibreEnergy, selected_data$Diastolic, method = "spearman")

carbo_syst_pearson <- cor(selected_data$CarbohydrateEnergy, selected_data$Systolic, method = "pearson")
carbo_syst_spearman <- cor(selected_data$CarbohydrateEnergy, selected_data$Systolic, method = "spearman")
carbo_dias_pearson <- cor(selected_data$CarbohydrateEnergy, selected_data$Diastolic, method = "pearson")
carbo_dias_spearman <- cor(selected_data$CarbohydrateEnergy, selected_data$Diastolic, method = "spearman")

correlation_table <- data.frame(
  Measure = c("FibreEnergy vs Systolic", "FibreEnergy vs Diastolic",
              "CarbohydrateEnergy vs Systolic", "CarbohydrateEnergy vs Diastolic"),
  Pearson_Correlation = c(Fibre_syst_pearson, Fibre_dias_pearson, carbo_syst_pearson, carbo_dias_pearson),
  Spearman_Correlation = c(Fibre_syst_spearman, Fibre_dias_spearman, carbo_syst_spearman, carbo_dias_spearman))

correlation_table %>%
  gt() %>%
  tab_header(
    title = "Table 2: Correlation Between Nutrient Energy Intake and Blood Pressure",
    subtitle = "Correlation values for both Systolic and Diastolic Blood Pressure") %>%
  fmt_number(columns = c(Pearson_Correlation, Spearman_Correlation), decimals = 2)
Table 2: Correlation Between Nutrient Energy Intake and Blood Pressure
Correlation values for both Systolic and Diastolic Blood Pressure
Measure Pearson_Correlation Spearman_Correlation
FibreEnergy vs Systolic 0.02 0.02
FibreEnergy vs Diastolic −0.09 −0.09
CarbohydrateEnergy vs Systolic −0.04 −0.05
CarbohydrateEnergy vs Diastolic −0.09 −0.10

Non-linear correlation table

Fibre_syst_kendall <- cor(selected_data$FibreEnergy, selected_data$Systolic, method = "kendall")
Fibre_syst_mic <- mine(selected_data$FibreEnergy, selected_data$Systolic)$MIC
Fibre_dias_kendall <- cor(selected_data$FibreEnergy, selected_data$Diastolic, method = "kendall")
Fibre_dias_mic <- mine(selected_data$FibreEnergy, selected_data$Diastolic)$MIC
carbo_syst_kendall <- cor(selected_data$CarbohydrateEnergy, selected_data$Systolic, method = "kendall")
carbo_syst_mic <- mine(selected_data$CarbohydrateEnergy, selected_data$Systolic)$MIC
carbo_dias_kendall <- cor(selected_data$CarbohydrateEnergy, selected_data$Diastolic, method = "kendall")
carbo_dias_mic <- mine(selected_data$CarbohydrateEnergy, selected_data$Diastolic)$MIC

correlation_table_mic_kendall <- data.frame(
  Measure = c("FibreEnergy vs Systolic", "FibreEnergy vs Diastolic",
              "CarbohydrateEnergy vs Systolic", "CarbohydrateEnergy vs Diastolic"),
  Kendall_Correlation = c(Fibre_syst_kendall, Fibre_dias_kendall, carbo_syst_kendall, carbo_dias_kendall),
  MIC_Correlation = c(Fibre_syst_mic, Fibre_dias_mic, carbo_syst_mic, carbo_dias_mic))

correlation_table_mic_kendall %>%
  gt() %>%
  tab_header(
    title = "Table 3: Kendall's Tau and MIC Correlation between Nutrient Energy Intake and Blood Pressure") %>%
  fmt_number(columns = c(Kendall_Correlation, MIC_Correlation), decimals = 3)
Table 3: Kendall's Tau and MIC Correlation between Nutrient Energy Intake and Blood Pressure
Measure Kendall_Correlation MIC_Correlation
FibreEnergy vs Systolic 0.010 0.044
FibreEnergy vs Diastolic −0.063 0.058
CarbohydrateEnergy vs Systolic −0.033 0.059
CarbohydrateEnergy vs Diastolic −0.070 0.072

Hypothesis testing

median_Fibre <- median(selected_data$FibreEnergy)
selected_data$FibreGroup <- ifelse(selected_data$FibreEnergy > median_Fibre, "High Fibre", "Low Fibre")

median_carb <- median(selected_data$CarbohydrateEnergy)
selected_data$CarboGroup <- ifelse(selected_data$CarbohydrateEnergy > median_carb, "High Carbo", "Low Carbo")
Fibre_systolic_anova <- aov(Systolic ~ FibreGroup, data = selected_data)
Fibre_diastolic_anova <- aov(Diastolic ~ FibreGroup, data = selected_data)
carbo_systolic_anova <- aov(Systolic ~ CarboGroup, data = selected_data)
carbo_diastolic_anova <- aov(Diastolic ~ CarboGroup, data = selected_data)

anova_results <- data.frame(
  Measure = c("Fibre vs Systolic", "Fibre vs Diastolic", 
              "Carbohydrate vs Systolic", "Carbohydrate vs Diastolic"),
  F_Statistic = c(summary(Fibre_systolic_anova)[[1]][["F value"]][1],
                  summary(Fibre_diastolic_anova)[[1]][["F value"]][1],
                  summary(carbo_systolic_anova)[[1]][["F value"]][1],
                  summary(carbo_diastolic_anova)[[1]][["F value"]][1]),
  P_Value = c(summary(Fibre_systolic_anova)[[1]][["Pr(>F)"]][1],
              summary(Fibre_diastolic_anova)[[1]][["Pr(>F)"]][1],
              summary(carbo_systolic_anova)[[1]][["Pr(>F)"]][1],
              summary(carbo_diastolic_anova)[[1]][["Pr(>F)"]][1])
)

anova_results %>%
  gt() %>%
  tab_header(
    title = "Table 4: ANOVA Test Results: Nutrient Energy Intake vs Blood Pressure"
  ) %>%
  fmt_number(columns = c(F_Statistic, P_Value), decimals = 3) %>%
  cols_label(
    Measure = "Nutrient vs Blood Pressure",
    F_Statistic = "F-Statistic",
    P_Value = "P-Value"
  ) %>%
  tab_options(
    table.font.size = "medium",
    heading.align = "center"
  )
Table 4: ANOVA Test Results: Nutrient Energy Intake vs Blood Pressure
Nutrient vs Blood Pressure F-Statistic P-Value
Fibre vs Systolic 0.842 0.359
Fibre vs Diastolic 41.924 0.000
Carbohydrate vs Systolic 5.385 0.020
Carbohydrate vs Diastolic 52.412 0.000

Model comparison

Systolic

set.seed(3888)
k <- 5
repeats <- 3

gam_rmse_systolic <- numeric()
gam_mae_systolic <- numeric()
gam_r2_systolic <- numeric()

mlr_rmse_systolic <- numeric()
mlr_mae_systolic <- numeric()
mlr_r2_systolic <- numeric()

robust_rmse_systolic <- numeric()
robust_mae_systolic <- numeric()
robust_r2_systolic <- numeric()

n <- nrow(normalized_data)

for (r in 1:repeats) {
  folds <- sample(rep(1:k, length.out = n))
  
  for (i in 1:k) {
    train_idx <- which(folds != i)
    test_idx <- which(folds == i)
    
    data_train <- normalized_data[train_idx, ]
    data_test <- normalized_data[test_idx, ]
    
    actual <- data_test$Systolic
    #GAM
    gam_model <- gam(Systolic ~ s(Age) + s(BMI) + SocialStatus + Gender + SmokeStatus +
                       s(AlcoholPercentage) + s(FibreEnergy) + s(CarbohydrateEnergy) + 
                       Identity + s(SodiumPotassiumRatio),
                     data = data_train)
    
    gam_predictions <- predict(gam_model, newdata = data_test)
    
    gam_rmse <- sqrt(mean((gam_predictions - actual)^2))
    gam_mae <- mean(abs(gam_predictions - actual))
    gam_r2 <- cor(gam_predictions, actual)^2
    
    gam_rmse_systolic <- c(gam_rmse_systolic, gam_rmse)
    gam_mae_systolic <- c(gam_mae_systolic, gam_mae)
    gam_r2_systolic <- c(gam_r2_systolic, gam_r2)
    #Multiple Linear Regression
    mlr_model <- lm(Systolic ~ Age + BMI + SocialStatus + Gender + SmokeStatus +
                      AlcoholPercentage + FibreEnergy + CarbohydrateEnergy + 
                      Identity + SodiumPotassiumRatio, data = data_train)
    
    mlr_predictions <- predict(mlr_model, newdata = data_test)
    
    mlr_rmse <- sqrt(mean((mlr_predictions - actual)^2))
    mlr_mae <- mean(abs(mlr_predictions - actual))
    mlr_r2 <- cor(mlr_predictions, actual)^2
    
    mlr_rmse_systolic <- c(mlr_rmse_systolic, mlr_rmse)
    mlr_mae_systolic <- c(mlr_mae_systolic, mlr_mae)
    mlr_r2_systolic <- c(mlr_r2_systolic, mlr_r2)
    #Robust Regression
    robust_model <- rlm(Systolic ~ Age + BMI + SocialStatus + Gender + SmokeStatus +
                          AlcoholPercentage + FibreEnergy + CarbohydrateEnergy + 
                          Identity + SodiumPotassiumRatio, data = data_train)
    
    robust_predictions <- predict(robust_model, newdata = data_test)
    
    robust_rmse <- sqrt(mean((robust_predictions - actual)^2))
    robust_mae <- mean(abs(robust_predictions - actual))
    robust_r2 <- cor(robust_predictions, actual)^2
    
    robust_rmse_systolic <- c(robust_rmse_systolic, robust_rmse)
    robust_mae_systolic <- c(robust_mae_systolic, robust_mae)
    robust_r2_systolic <- c(robust_r2_systolic, robust_r2)
  }
}

mean_systolic_gam_rmse <- mean(gam_rmse_systolic)
mean_systolic_gam_mae <- mean(gam_mae_systolic)
mean_systolic_gam_r2 <- mean(gam_r2_systolic)

mean_systolic_mlr_rmse <- mean(mlr_rmse_systolic)
mean_systolic_mlr_mae <- mean(mlr_mae_systolic)
mean_systolic_mlr_r2 <- mean(mlr_r2_systolic)

mean_systolic_robust_rmse <- mean(robust_rmse_systolic)
mean_systolic_robust_mae <- mean(robust_mae_systolic)
mean_systolic_robust_r2 <- mean(robust_r2_systolic)
Systolic_df <- data.frame(
    Model = c("GAM", "Multiple Linear", "Robust"),
    RMSE = c(mean_systolic_gam_rmse, mean_systolic_mlr_rmse, mean_systolic_robust_rmse),
    MAE = c(mean_systolic_gam_mae, mean_systolic_mlr_mae, mean_systolic_robust_mae),
    R2 = c(mean_systolic_gam_r2, mean_systolic_mlr_r2, mean_systolic_robust_r2)
)

colors <- c('#FF5179', '#BF59D9', '#E76BF3')

rmse_range <- c(min(Systolic_df$RMSE) * 0.95, max(Systolic_df$RMSE) * 1.05)
mae_range <- c(min(Systolic_df$MAE) * 0.95, max(Systolic_df$MAE) * 1.05)
r2_range <- c(min(Systolic_df$R2) * 0.95, max(Systolic_df$R2) * 1.05)

Systolic_rmse <- plot_ly(Systolic_df, x = ~Model, y = ~round(RMSE, 3), type = 'bar', color = ~Model, colors = colors,
               text = ~round(RMSE, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
  layout(
    yaxis = list(title = 'RMSE', range = rmse_range),
    showlegend = FALSE,
    paper_bgcolor = 'rgba(245, 246, 249, 1)',
    plot_bgcolor = 'rgba(0, 0, 0, 0)')

Systolic_mae <- plot_ly(Systolic_df, x = ~Model, y = ~round(MAE, 3), type = 'bar', color = ~Model, colors = colors,
               text = ~round(MAE, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
  layout(
    yaxis = list(title = 'MAE', range = mae_range),
    showlegend = FALSE,
    paper_bgcolor = 'rgba(245, 246, 249, 1)',
    plot_bgcolor = 'rgba(0, 0, 0, 0)')

Systolic_r2 <- plot_ly(Systolic_df, x = ~Model, y = ~round(R2, 3), type = 'bar', color = ~Model, colors = colors,
               text = ~round(R2, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
  layout(
    yaxis = list(title = 'R²', range = r2_range),
    showlegend = FALSE,
    paper_bgcolor = 'rgba(245, 246, 249, 1)',
    plot_bgcolor = 'rgba(0, 0, 0, 0)')

SystolicFig <- subplot(Systolic_rmse, Systolic_mae, Systolic_r2, nrows = 1, shareY = FALSE) %>%
  layout(
    title = list(
      text = "Figure 2: Model Comparison (Systolic)",
      font = list(size = 16),x = 0.5, y = 1.15),
    annotations = list(
      list(text = "A",x = 0, y = 1.05, xref = "paper", yref = "paper", showarrow = FALSE, font = list(size = 14, color = "black"), xanchor = "center"),
      list(text = "B", x = 0.35, y = 1.05, xref = "paper", yref = "paper", showarrow = FALSE, font = list(size = 14, color = "black"), xanchor = "center"),
      list(
        text = "C",
        x = 0.70,
        y = 1.05,
        xref = "paper",
        yref = "paper",
        showarrow = FALSE,
        font = list(size = 14, color = "black"),
        xanchor = "center"),
      list(
        text = "A. RMSE comparison for Generalized Additive Model, Multiple Linear Regression and Robust Regression",
        x = 0,
        y = -0.15,
        xref = "paper",
        yref = "paper",
        showarrow = FALSE,
        font = list(size = 12),
        xanchor = "left"
      ),
      list(
        text = "B. MAE comparison for Generalized Additive Model, Multiple Linear Regression and Robust Regression",
        x = 0,
        y = -0.2,
        xref = "paper",
        yref = "paper",
        showarrow = FALSE,
        font = list(size = 12),
        xanchor = "left"
      ),
      list(
        text = "C. R² comparison for Generalized Additive Model, Multiple Linear Regression and Robust Regression",
        x = 0,
        y = -0.25,
        xref = "paper",
        yref = "paper",
        showarrow = FALSE,
        font = list(size = 12),
        xanchor = "left"
      )
    ),
    margin = list(b = 140, t = 80)
  )

SystolicFig

Diastolic

set.seed(3888)
gam_rmse_diastolic <- numeric()
gam_mae_diastolic <- numeric()
gam_r2_diastolic <- numeric()

mlr_rmse_diastolic <- numeric()
mlr_mae_diastolic <- numeric()
mlr_r2_diastolic <- numeric()

robust_rmse_diastolic <- numeric()
robust_mae_diastolic <- numeric()
robust_r2_diastolic <- numeric()

for (r in 1:repeats) {
  folds <- sample(rep(1:k, length.out = n))
  
  for (i in 1:k) {
    train_idx <- which(folds != i)
    test_idx <- which(folds == i)
    data_train <- normalized_data[train_idx, ]
    data_test <- normalized_data[test_idx, ]
    
    actual <- data_test$Diastolic
    #GAM
    gam_model <- gam(Diastolic ~ s(Age) + s(BMI) + SocialStatus + Gender + SmokeStatus +
                       s(AlcoholPercentage) + s(FibreEnergy) + s(CarbohydrateEnergy) + 
                       Identity + s(SodiumPotassiumRatio),
                     data = data_train)
    
    gam_predictions <- predict(gam_model, newdata = data_test)
    
    gam_rmse <- sqrt(mean((gam_predictions - actual)^2))
    gam_mae <- mean(abs(gam_predictions - actual))
    gam_r2 <- cor(gam_predictions, actual)^2
    
    gam_rmse_diastolic <- c(gam_rmse_diastolic, gam_rmse)
    gam_mae_diastolic <- c(gam_mae_diastolic, gam_mae)
    gam_r2_diastolic <- c(gam_r2_diastolic, gam_r2)
    #Multiple Linear Regression
    mlr_model <- lm(Diastolic ~ Age + BMI + SocialStatus + Gender + SmokeStatus +
                      AlcoholPercentage + FibreEnergy + CarbohydrateEnergy + 
                      Identity + SodiumPotassiumRatio, data = data_train)
    
    mlr_predictions <- predict(mlr_model, newdata = data_test)
    
    mlr_rmse <- sqrt(mean((mlr_predictions - actual)^2))
    mlr_mae <- mean(abs(mlr_predictions - actual))
    mlr_r2 <- cor(mlr_predictions, actual)^2
    
    mlr_rmse_diastolic <- c(mlr_rmse_diastolic, mlr_rmse)
    mlr_mae_diastolic <- c(mlr_mae_diastolic, mlr_mae)
    mlr_r2_diastolic <- c(mlr_r2_diastolic, mlr_r2)
    #Robust Regression
    robust_model <- rlm(Diastolic ~ Age + BMI + SocialStatus + Gender + SmokeStatus +
                          AlcoholPercentage + FibreEnergy + CarbohydrateEnergy + 
                          Identity + SodiumPotassiumRatio, data = data_train)
    
    robust_predictions <- predict(robust_model, newdata = data_test)
    
    robust_rmse <- sqrt(mean((robust_predictions - actual)^2))
    robust_mae <- mean(abs(robust_predictions - actual))
    robust_r2 <- cor(robust_predictions, actual)^2
    
    robust_rmse_diastolic <- c(robust_rmse_diastolic, robust_rmse)
    robust_mae_diastolic <- c(robust_mae_diastolic, robust_mae)
    robust_r2_diastolic <- c(robust_r2_diastolic, robust_r2)
  }
}

mean_diastolic_gam_rmse <- mean(gam_rmse_diastolic)
mean_diastolic_gam_mae <- mean(gam_mae_diastolic)
mean_diastolic_gam_r2 <- mean(gam_r2_diastolic)

mean_diastolic_mlr_rmse <- mean(mlr_rmse_diastolic)
mean_diastolic_mlr_mae <- mean(mlr_mae_diastolic)
mean_diastolic_mlr_r2 <- mean(mlr_r2_diastolic)

mean_diastolic_robust_rmse <- mean(robust_rmse_diastolic)
mean_diastolic_robust_mae <- mean(robust_mae_diastolic)
mean_diastolic_robust_r2 <- mean(robust_r2_diastolic)
Diastolic_df <- data.frame(
    Model = c("GAM", "Multiple Linear", "Robust"),
    RMSE = c(mean_diastolic_gam_rmse, mean_diastolic_mlr_rmse, mean_diastolic_robust_rmse),
    MAE = c(mean_diastolic_gam_mae, mean_diastolic_mlr_mae, mean_diastolic_robust_mae),
    R2 = c(mean_diastolic_gam_r2, mean_diastolic_mlr_r2, mean_diastolic_robust_r2)
)

colors <- c('#FF5179', '#BF59D9', '#E76BF3')

rmse_range <- c(min(Diastolic_df$RMSE) * 0.95, max(Diastolic_df$RMSE) * 1.05)
mae_range <- c(min(Diastolic_df$MAE) * 0.95, max(Diastolic_df$MAE) * 1.05)
r2_range <- c(min(Diastolic_df$R2) * 0.95, max(Diastolic_df$R2) * 1.05)

Diastolic_rmse <- plot_ly(Diastolic_df, x = ~Model, y = ~round(RMSE, 3), type = 'bar', color = ~Model, colors = colors,
               text = ~round(RMSE, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
  layout(
    yaxis = list(title = 'RMSE', range = rmse_range),
    showlegend = FALSE,
    paper_bgcolor = 'rgba(245, 246, 249, 1)',
    plot_bgcolor = 'rgba(0, 0, 0, 0)')

Diastolic_mae <- plot_ly(Diastolic_df, x = ~Model, y = ~round(MAE, 3), type = 'bar', color = ~Model, colors = colors,
               text = ~round(MAE, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
  layout(
    yaxis = list(title = 'MAE', range = mae_range),
    showlegend = FALSE,
    paper_bgcolor = 'rgba(245, 246, 249, 1)',
    plot_bgcolor = 'rgba(0, 0, 0, 0)')

Diastolic_r2 <- plot_ly(Diastolic_df, x = ~Model, y = ~round(R2, 3), type = 'bar', color = ~Model, colors = colors,
               text = ~round(R2, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
  layout(
    yaxis = list(title = 'R²', range = r2_range),
    showlegend = FALSE,
    paper_bgcolor = 'rgba(245, 246, 249, 1)',
    plot_bgcolor = 'rgba(0, 0, 0, 0)')

DiastolicFig <- subplot(
  Diastolic_rmse, 
  Diastolic_mae, 
  Diastolic_r2, 
  nrows = 1, 
  shareY = FALSE
) %>%
  layout(
    title = list(
      text = "Figure 3: Model Comparison (Diastolic)",
      font = list(size = 16),x = 0.5, y = 1.15),
    annotations = list(
      list(text = "A",x = 0, y = 1.05, xref = "paper", yref = "paper", showarrow = FALSE, font = list(size = 14, color = "black"), xanchor = "center"),
      list(text = "B", x = 0.35, y = 1.05, xref = "paper", yref = "paper", showarrow = FALSE, font = list(size = 14, color = "black"), xanchor = "center"),
      list(
        text = "C",
        x = 0.70,
        y = 1.05,
        xref = "paper",
        yref = "paper",
        showarrow = FALSE,
        font = list(size = 14, color = "black"),
        xanchor = "center"),
      list(
        text = "A. RMSE comparison for Generalized Additive Model, Multiple Linear Regression and Robust Regression",
        x = 0,
        y = -0.15,
        xref = "paper",
        yref = "paper",
        showarrow = FALSE,
        font = list(size = 12),
        xanchor = "left"
      ),
      list(
        text = "B. MAE comparison for Generalized Additive Model, Multiple Linear Regression and Robust Regression",
        x = 0,
        y = -0.2,
        xref = "paper",
        yref = "paper",
        showarrow = FALSE,
        font = list(size = 12),
        xanchor = "left"
      ),
      list(
        text = "C. R² comparison for Generalized Additive Model, Multiple Linear Regression and Robust Regression",
        x = 0,
        y = -0.25,
        xref = "paper",
        yref = "paper",
        showarrow = FALSE,
        font = list(size = 12),
        xanchor = "left"
      )
    ),
    margin = list(b = 140, t = 80)
  )

DiastolicFig

Full model vs Reduced model

set.seed(3888)
k <- 5
repeats <- 3

gam_full_rmse_systolic <- numeric()
gam_full_mae_systolic <- numeric()
gam_full_r2_systolic <- numeric()

gam_reduced_rmse_systolic <- numeric()
gam_reduced_mae_systolic <- numeric()
gam_reduced_r2_systolic <- numeric()

n <- nrow(normalized_data)

for (r in 1:repeats) {
  folds <- sample(rep(1:k, length.out = n))
  
  for (i in 1:k) {
    train_idx <- which(folds != i)
    test_idx <- which(folds == i)
    
    data_train <- normalized_data[train_idx, ]
    data_test <- normalized_data[test_idx, ]
    
    actual <- data_test$Systolic
    #GAM full
    gam_full_model <- gam(Systolic ~ s(Age) + s(BMI) + SocialStatus + Gender + SmokeStatus +
                       s(AlcoholPercentage) + s(FibreEnergy) + s(CarbohydrateEnergy) + 
                       Identity + s(SodiumPotassiumRatio),
                     data = data_train)
    
    gam_full_predictions <- predict(gam_full_model, newdata = data_test)
    
    gam_full_rmse <- sqrt(mean((gam_full_predictions - actual)^2))
    gam_full_mae <- mean(abs(gam_full_predictions - actual))
    gam_full_r2 <- cor(gam_full_predictions, actual)^2
    
    gam_full_rmse_systolic <- c(gam_full_rmse_systolic, gam_full_rmse)
    gam_full_mae_systolic <- c(gam_full_mae_systolic, gam_full_mae)
    gam_full_r2_systolic <- c(gam_full_r2_systolic, gam_full_r2)
    
    #GAM reduced
    gam_reduced_model <- gam(Systolic ~ s(Age) + s(BMI) + SocialStatus + Gender + SmokeStatus +
                       s(AlcoholPercentage) + Identity + s(SodiumPotassiumRatio), data = data_train)
    gam_reduced_predictions <- predict(gam_reduced_model, newdata = data_test)
    
    gam_reduced_rmse <- sqrt(mean((gam_reduced_predictions - actual)^2))
    gam_reduced_mae <- mean(abs(gam_reduced_predictions - actual))
    gam_reduced_r2 <- cor(gam_reduced_predictions, actual)^2
    
    gam_reduced_rmse_systolic <- c(gam_reduced_rmse_systolic, gam_reduced_rmse)
    gam_reduced_mae_systolic <- c(gam_reduced_mae_systolic, gam_reduced_mae)
    gam_reduced_r2_systolic <- c(gam_reduced_r2_systolic, gam_reduced_r2)
  }
}

mean_systolic_gam_full_rmse <- mean(gam_full_rmse_systolic)
mean_systolic_gam_full_mae <- mean(gam_full_mae_systolic)
mean_systolic_gam_full_r2 <- mean(gam_full_r2_systolic)

mean_systolic_gam_reduced_rmse <- mean(gam_reduced_rmse_systolic)
mean_systolic_gam_reduced_mae <- mean(gam_reduced_mae_systolic)
mean_systolic_gam_reduced_r2 <- mean(gam_reduced_r2_systolic)
Systolic_df <- data.frame(
    Model = c("GAM Full", "GAM Reduced"),
    RMSE = c(mean_systolic_gam_full_rmse, mean_systolic_gam_reduced_rmse),
    MAE = c(mean_systolic_gam_full_mae, mean_systolic_gam_reduced_mae),
    R2 = c(mean_systolic_gam_full_r2, mean_systolic_gam_reduced_r2)
)

rmse_range <- c(min(Systolic_df$RMSE) * 0.95, max(Systolic_df$RMSE) * 1.05)
mae_range <- c(min(Systolic_df$MAE) * 0.95, max(Systolic_df$MAE) * 1.05)
r2_range <- c(min(Systolic_df$R2) * 0.95, max(Systolic_df$R2) * 1.05)

Systolic_rmse <- plot_ly(Systolic_df, x = ~Model, y = ~round(RMSE, 3), type = 'bar', color = ~Model, colors = colors,
               text = ~round(RMSE, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
  layout(
    yaxis = list(title = 'RMSE', range = rmse_range),
    showlegend = FALSE,
    paper_bgcolor = 'rgba(245, 246, 249, 1)',
    plot_bgcolor = 'rgba(0, 0, 0, 0)')

Systolic_mae <- plot_ly(Systolic_df, x = ~Model, y = ~round(MAE, 3), type = 'bar', color = ~Model, colors = colors,
               text = ~round(MAE, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
  layout(
    yaxis = list(title = 'MAE', range = mae_range),
    showlegend = FALSE,
    paper_bgcolor = 'rgba(245, 246, 249, 1)',
    plot_bgcolor = 'rgba(0, 0, 0, 0)')

Systolic_r2 <- plot_ly(Systolic_df, x = ~Model, y = ~round(R2, 3), type = 'bar', color = ~Model, colors = colors,
               text = ~round(R2, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
  layout(
    yaxis = list(title = 'R²', range = r2_range),
    showlegend = FALSE,
    paper_bgcolor = 'rgba(245, 246, 249, 1)',
    plot_bgcolor = 'rgba(0, 0, 0, 0)')

SystolicFig <- subplot(Systolic_rmse, Systolic_mae, Systolic_r2, nrows = 1, shareY = FALSE) %>%
  layout(
    title = list(
      text = "Figure 4: GAM Full Model* and Reduced Model** Comparison (Systolic)",
      font = list(size = 16),
      x = 0.5,
      y = 1.15
    ),
    annotations = list(
      list(
        text = "A",
        x = 0,
        y = 1.05,
        xref = "paper",
        yref = "paper",
        showarrow = FALSE,
        font = list(size = 14, color = "black"),
        xanchor = "center"
      ),
      list(
        text = "B",
        x = 0.35,
        y = 1.05,
        xref = "paper",
        yref = "paper",
        showarrow = FALSE,
        font = list(size = 14, color = "black"),
        xanchor = "center"
      ),
      list(
        text = "C",
        x = 0.70,
        y = 1.05,
        xref = "paper",
        yref = "paper",
        showarrow = FALSE,
        font = list(size = 14, color = "black"),
        xanchor = "center"
      ),
      list(
        text = "A. RMSE comparison for Generalized Additive Model Full Model* and Reduced Model** Comparison (Systolic)",
        x = 0,
        y = -0.15,
        xref = "paper",
        yref = "paper",
        showarrow = FALSE,
        font = list(size = 12),
        xanchor = "left"
      ),
      list(
        text = "B. MAE comparison for Generalized Additive Model Full Model* and Reduced Model** Comparison (Systolic)",
        x = 0,
        y = -0.2, 
        xref = "paper",
        yref = "paper",
        showarrow = FALSE,
        font = list(size = 12),
        xanchor = "left"
      ),
      list(
        text = "C. R² comparison for Generalized Additive Model Full Model* and Reduced Model** Comparison (Systolic)",
        x = 0,
        y = -0.25,
        xref = "paper",
        yref = "paper",
        showarrow = FALSE,
        font = list(size = 12),
        xanchor = "left"
      ), list(
        text = "*Full Model: Model with all risk factors and carb and fiber",
        x = 0,
        y = -0.3,
        xref = "paper",
        yref = "paper",
        showarrow = FALSE,
        font = list(size = 10),
        xanchor = "left"
      ), list(
        text = "**Reduced Model: Model with only risk factors",
        x = 0,
        y = -0.35,
        xref = "paper",
        yref = "paper",
        showarrow = FALSE,
        font = list(size = 10),
        xanchor = "left"
      )
    ),
    margin = list(b = 140, t = 80)
  )

SystolicFig
set.seed(3888)
k <- 5
repeats <- 3

gam_full_rmse_diastolic <- numeric()
gam_full_mae_diastolic <- numeric()
gam_full_r2_diastolic <- numeric()

gam_reduced_rmse_diastolic <- numeric()
gam_reduced_mae_diastolic <- numeric()
gam_reduced_r2_diastolic <- numeric()

n <- nrow(normalized_data)

for (r in 1:repeats) {
  folds <- sample(rep(1:k, length.out = n))
  
  for (i in 1:k) {
    train_idx <- which(folds != i)
    test_idx <- which(folds == i)
    
    data_train <- normalized_data[train_idx, ]
    data_test <- normalized_data[test_idx, ]
    
    actual <- data_test$Diastolic
    #GAM full
    gam_full_model <- gam(Diastolic ~ s(Age) + s(BMI) + SocialStatus + Gender + SmokeStatus +
                       s(AlcoholPercentage) + s(FibreEnergy) + s(CarbohydrateEnergy) + 
                       Identity + s(SodiumPotassiumRatio),
                     data = data_train)
    
    gam_full_predictions <- predict(gam_full_model, newdata = data_test)
    
    gam_full_rmse <- sqrt(mean((gam_full_predictions - actual)^2))
    gam_full_mae <- mean(abs(gam_full_predictions - actual))
    gam_full_r2 <- cor(gam_full_predictions, actual)^2
    
    gam_full_rmse_diastolic <- c(gam_full_rmse_diastolic, gam_full_rmse)
    gam_full_mae_diastolic <- c(gam_full_mae_diastolic, gam_full_mae)
    gam_full_r2_diastolic <- c(gam_full_r2_diastolic, gam_full_r2)
    
    #GAM reduced
    gam_reduced_model <- gam(Diastolic ~ s(Age) + s(BMI) + SocialStatus + Gender + SmokeStatus +
                       s(AlcoholPercentage) + Identity + s(SodiumPotassiumRatio), data = data_train)
    gam_reduced_predictions <- predict(gam_reduced_model, newdata = data_test)
    
    gam_reduced_rmse <- sqrt(mean((gam_reduced_predictions - actual)^2))
    gam_reduced_mae <- mean(abs(gam_reduced_predictions - actual))
    gam_reduced_r2 <- cor(gam_reduced_predictions, actual)^2
    
    gam_reduced_rmse_diastolic <- c(gam_reduced_rmse_diastolic, gam_reduced_rmse)
    gam_reduced_mae_diastolic <- c(gam_reduced_mae_diastolic, gam_reduced_mae)
    gam_reduced_r2_diastolic <- c(gam_reduced_r2_diastolic, gam_reduced_r2)
  }
}

mean_diastolic_gam_full_rmse <- mean(gam_full_rmse_diastolic)
mean_diastolic_gam_full_mae <- mean(gam_full_mae_diastolic)
mean_diastolic_gam_full_r2 <- mean(gam_full_r2_diastolic)

mean_diastolic_gam_reduced_rmse <- mean(gam_reduced_rmse_diastolic)
mean_diastolic_gam_reduced_mae <- mean(gam_reduced_mae_diastolic)
mean_diastolic_gam_reduced_r2 <- mean(gam_reduced_r2_diastolic)
Diastolic_df <- data.frame(
    Model = c("GAM Full", "GAM Reduced"),
    RMSE = c(mean_diastolic_gam_full_rmse, mean_diastolic_gam_reduced_rmse),
    MAE = c(mean_diastolic_gam_full_mae, mean_diastolic_gam_reduced_mae),
    R2 = c(mean_diastolic_gam_full_r2, mean_diastolic_gam_reduced_r2)
)

rmse_range <- c(min(Diastolic_df$RMSE) * 0.95, max(Diastolic_df$RMSE) * 1.05)
mae_range <- c(min(Diastolic_df$MAE) * 0.95, max(Diastolic_df$MAE) * 1.05)
r2_range <- c(min(Diastolic_df$R2) * 0.95, max(Diastolic_df$R2) * 1.05)

Diastolic_rmse <- plot_ly(Diastolic_df, x = ~Model, y = ~round(RMSE, 3), type = 'bar', color = ~Model, colors = colors,
               text = ~round(RMSE, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
  layout(
    yaxis = list(title = 'RMSE', range = rmse_range),
    showlegend = FALSE,
    paper_bgcolor = 'rgba(245, 246, 249, 1)',
    plot_bgcolor = 'rgba(0, 0, 0, 0)')

Diastolic_mae <- plot_ly(Diastolic_df, x = ~Model, y = ~round(MAE, 3), type = 'bar', color = ~Model, colors = colors,
               text = ~round(MAE, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
  layout(
    yaxis = list(title = 'MAE', range = mae_range),
    showlegend = FALSE,
    paper_bgcolor = 'rgba(245, 246, 249, 1)',
    plot_bgcolor = 'rgba(0, 0, 0, 0)')

Diastolic_r2 <- plot_ly(Diastolic_df, x = ~Model, y = ~round(R2, 3), type = 'bar', color = ~Model, colors = colors,
               text = ~round(R2, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
  layout(
    yaxis = list(title = 'R²', range = r2_range),
    showlegend = FALSE,
    paper_bgcolor = 'rgba(245, 246, 249, 1)',
    plot_bgcolor = 'rgba(0, 0, 0, 0)')

DiastolicFig <- subplot(Diastolic_rmse, Diastolic_mae, Diastolic_r2, nrows = 1, shareY = FALSE) %>%
  layout(title = list(
    text = "Figure 5: GAM Full Model* and Reduced Model** Comparison (Diastolic)",font = list(size = 16),x = 0.5,y = 1.15),
    annotations = list(
      list(text = "A",x = 0,y = 1.05,xref = "paper",yref = "paper",showarrow = FALSE,font = list(size = 14, color = "black"),xanchor = "center"),
      list(text = "B",x = 0.35, y = 1.05,xref = "paper",yref = "paper",showarrow = FALSE,font = list(size = 14, color = "black"),xanchor = "center"),
      list(text = "C",x = 0.70, y = 1.05,xref = "paper",yref = "paper",showarrow = FALSE,font = list(size = 14, color = "black"),xanchor = "center"),
      list(text = "A. RMSE comparison for Generalized Additive Model Full Model* and Reduced Model** Comparison (Diastolic)",x = 0,y = -0.15,xref = "paper",yref = "paper",showarrow = FALSE,font = list(size = 12),xanchor = "left"),
      list(text = "B. MAE comparison for Generalized Additive Model Full Model* and Reduced Model** Comparison (Diastolic)",x = 0,y = -0.2,xref = "paper",yref = "paper",showarrow = FALSE,font = list(size = 12),xanchor = "left"),
      list(
        text = "C. R² comparison for Generalized Additive Model Full Model* and Reduced Model** Comparison (Diastolic)",
        x = 0,
        y = -0.25,
        xref = "paper",
        yref = "paper",
        showarrow = FALSE,
        font = list(size = 12),
        xanchor = "left"
      ), list(
        text = "*Full Model: Model with all risk factors and carb and fiber",
        x = 0,
        y = -0.3,
        xref = "paper",
        yref = "paper",
        showarrow = FALSE,
        font = list(size = 10),
        xanchor = "left"
      ), list(
        text = "**Reduced Model: Model with only risk factors",
        x = 0,
        y = -0.35,
        xref = "paper",
        yref = "paper",
        showarrow = FALSE,
        font = list(size = 10),
        xanchor = "left"
      )
    ),
    margin = list(b = 140, t = 80)
  )

DiastolicFig